Load data


• Neocortex_v3.RData Not necessary if just plotting on already calculated PCA, UMAP, clusters.

# Full dataset:
load("../data/cfc4b2_neocortex_v3.RData")

# 40k random subset (toy dataset)
# ncx.40k <- read_rds("constellation_plots/ncx_v3_40k.rds")

# 100k random subset of excitatory neuron lineage only.
ncx.exn.100k <- read_rds("../data/exn_lineage/toy/ncx.v3.exn.sub_100k.rds")

• Cluster / marker tables.

ncx.clusters <- read_delim("../tbls/83d19_Neocortex_allindividuals_combinedclusters_v1.txt",
                              "\t", escape_double = FALSE, trim_ws = TRUE)

ncx.markers <- read_delim("../tbls/8f0fc_Neocortex_subset1_clustermarkers_combo2.txt", 
                  "\t", escape_double = FALSE, trim_ws = TRUE)

scrattch.hicat steps to build constellation plots.

1. Define cells to build the plot from.

[All cells: 404.2K]

a. All clusters / cell types:

Keep only cells with combo2 $combined.cluster.2 annotation.

b. Excitatory neuron lineage only:

Keep only cells with combo2 $combined.cluster.2 annotation AND whose $combined.cluster.2 annotation belongs to excitatory neuronal lineage classes.

b.1. Cells in exn linage 100k cell toy object:

2. Build dataframes for constellation plots.

1. cl

2. rd.dat

3. cl.df

4. rd.cl.center Find cluster centers from UMAP coordinates

5. cl.center.df

Join cl.df and rd.cl.center into cl.center.df for input into get_KNN_graph.

6. knn.cl


Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count

3. Make constellation plot


2. Find DE genes between connected clusters.


Neuron_3 cells with nearest neighbors in Neuron_8 cell.cl.counts is a matrix of all cells and their counts of nearest neighbors in each cluster.

Find cells in cluster_a with bearest neighbors in cluster_b

nn.counts <- x[[cluster_b_col]]
length(x$cell.name)
[1] 10663
sum(nn.counts > 0)
[1] 6103
sum(nn.counts == 0)
[1] 4560
(median.nnCounts <- median(nn.counts[nn.counts > 0]))
[1] 5

Distribution of neighbor counts in cluster_b for cells in a given cluster_a. Use this to find the point at which cells will be split into comparison groups.

Compare cells above and below the median count of cluster_b neighbors.

cells <- list()
cells$above.median <- x %>% filter(get(cluster_b_col) > median) %>% pull(cell.name)
cells$below.median <- x %>% filter(get(cluster_b_col) < median)  %>% pull(cell.name)

s.obj <- SetIdent(s.obj, cells = cells$above.median, value = 'nn_ct_above_med') %>% 
          SetIdent(cells = cells$below.median, value = 'nn_ct_below_med')

table(s.obj@active.ident) %>% as.data.frame.matrix()
markers <- list()
markers$nn_above_med <- FindMarkers(s.obj, slot = "scale.data", 
                                  # cells.1 = cells$above.median, cells.2 = cells$below.median,
                                  ident.1 = "nn_above_med", ident.2 = "nn_below_med", 
                                  logfc.threshold = 0)

Calculate enrichment ratio, filter genes w. adj p-value < 0.05, sort table. Positive values indicate that the feature is more highly expressed in the first group.


markers.tmp <- markers$nn_above_med %>% rownames_to_column("gene") %>%
  mutate(enrich.ratio = pct.1 / pct.2,
         gene.score = avg_diff * enrich.ratio,
         across(.cols = where(is.numeric), .fns = round, digits = 5)
  ) %>%
  filter(p_val_adj <= 0.05) %>% 
  select(gene, pct.1, pct.2, enrich.ratio, avg_diff, gene.score, p_val_adj) %>%
  arrange(desc(gene.score))

write_tsv(markers.tmp, "../out/DEgenes_neuron3_vs_neuron8.tsv")

Make heatmap:

Extra functions

---
title: "Constellation Plots: Neocortex, 2nd Trimester" 
subtitle: "All Samples, All cell types"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

```{r global_options, include=FALSE}

knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path = 'Figs/',
                      echo = FALSE, warning = FALSE, message = FALSE)
```

```{r, echo=FALSE}
source_rmd("scrattch.hicat_fxns.Rmd")
```

# Load data
*** 

• Neocortex_v3.RData
Not necessary if just plotting on already calculated PCA, UMAP, clusters.
```{r load-neocortex, eval = FALSE, echo=TRUE}
# Full dataset:
load("../data/cfc4b2_neocortex_v3.RData")

# 40k random subset (toy dataset)
# ncx.40k <- read_rds("constellation_plots/ncx_v3_40k.rds")

# 100k random subset of excitatory neuron lineage only.
ncx.exn.100k <- read_rds("../data/exn_lineage/toy/ncx.v3.exn.sub_100k.rds")
```

• Cluster / marker tables.
```{r load-tbls,  echo=TRUE, paged.print=TRUE}
ncx.clusters <- read_delim("../tbls/83d19_Neocortex_allindividuals_combinedclusters_v1.txt",
                              "\t", escape_double = FALSE, trim_ws = TRUE)

ncx.markers <- read_delim("../tbls/8f0fc_Neocortex_subset1_clustermarkers_combo2.txt", 
                  "\t", escape_double = FALSE, trim_ws = TRUE)
```

***

# `scrattch.hicat` steps to build constellation plots.

## 1. Define cells to build the plot from.
[All cells: 404.2K]
  
## a. All clusters / cell types:
Keep only cells with *_combo2_* `$combined.cluster.2` annotation.
```{r eval=FALSE}

cells.cl.df <-  ncx.clusters %>% filter(str_detect(combined.cluster.2, "combo2"))

s.obj <- Neocortex %>% subsetSeurat(cells.keep = cells.cl.df$cell.name)
# 348K cells

# Sanity check
cells.cl.df$cell.name %>% identical(s.obj@meta.data %>% rownames)
[TRUE]
```

### b. Excitatory neuron lineage only:

Keep only cells with *_combo2_* `$combined.cluster.2` annotation AND
whose `$combined.cluster.2` annotation belongs to excitatory neuronal lineage classes.
```{r eval=FALSE}
cells.cl.df <- ncx.clusters.exn <- 
                  ncx.clusters %>% 
                      filter(str_detect(combined.cluster.2, "combo2") ) %>% 
                        filter(str_detect(combined.cluster.2, "Neuron|CR|Dividing|RG|IPC|OPC") )


s.obj <- Neocortex %>% subsetSeurat(cells.keep = cells.cl.df$cell.name)

# Remove @counts slot (5GB)
s.obj@assays$RNA@counts <- matrix(c(0,0))
# 271.4K cells in excitatory lineage.

# Sanity check:
cells.cl.df$cell.name %>% identical(s.obj@meta.data %>% rownames)
# [TRUE]

s.obj@meta.data <- left_join(x = s.obj@meta.data %>% select(-orig.ident) %>% rownames_to_column("cell.name"),
                            y = cells.cl.df[ , c("cell.name", "combined.cluster.2")]) %>% 
                      set_rownames(.$cell.name)

write_rds(s.obj, "../data/exn_lineage/neocortex_exn_lineage_271k.rds", compress = "gz")
```

### b.1. Cells in exn linage 100k cell toy object:
```{r}
# Define which object the constellation plot is based on:
s.obj <- ncx.exn.100k

cells.cl.df <- ncx.clusters %>% 
                  filter(cell.name %in% rownames(s.obj@meta.data) )
```


## 2. Build dataframes for constellation plots.

### 1. `cl` 
```{r}
cl <- cells.cl.df$combined.cluster.2 %>% as.factor %>% 
          set_names(cells.cl.df$cell.name)
# 128 clusters in all _combo2_ cells
# 77 clusters in all _combo2_ & ExN lineage cells.
```

### 2. `rd.dat`
```{r}
rd.dat <- list(umap = s.obj@reductions$umap@cell.embeddings,
               pca = s.obj@reductions$pca@cell.embeddings)

# Subset UMAP and PCA cell embeddings: keep only cells in ncx.clusters.
# (Exclude cells in cluster 0 and keep only cells with "combo2" in combined cluster 2 name.)
# rd.dat %<>% lapply(function(x) x[names(cl), ])
```

### 3. `cl.df`
```{r}
cl.df <- get_cl_df(cl)

cl.df$clade <- str_split_fixed(cl.df$cluster_label, "_", 2)[ ,1] %>% tolower 
# Add clade_id, clade_color to cl.df
clade.cols <- data.frame(# clade = unique(cl.df$clade),
                        cluster_color = c("cr"= "darkgrey", 
                                  "dividing" = "darkkhaki", 
                                  "neuron" = "deepskyblue", 
                                  "inteneuron" = "deeppink2", 
                                  "ipc" = "brown4", 
                                  "microglia" = "darkorchid1",
                                  "opc" = "cadetblue", 
                                  "other" = "darkslateblue", 
                                  "rg" = "darkorange", 
                                  "vascular" = "blanchedalmond")
            ) %>% rownames_to_column("clade")

cl.df %<>% left_join(clade.cols)
rm(clade.cols)

# cells.cl.df: Add cluster_id column from cl.df; remove unused columns. 
cells.cl.df <- left_join(cells.cl.df %>% select(cell.name, cell.type, combined.cluster.2),
                           cl.df %>% select(cluster_label, cluster_id), 
                           by = c("combined.cluster.2" = "cluster_label")
                ) %>% mutate(cluster_id = as.factor(cluster_id))
```

### 4. `rd.cl.center` Find cluster centers from UMAP coordinates
```{r}
rd.cl.center <- get_RD_cl_center(rd.dat$umap, cl) 

rd.cl.center %<>% 
  as.data.frame %>% 
  set_names(c("x", "y")) %>%
  add_column(cl = 1:nrow(rd.cl.center), .before = "x") %>%
  # add_column preserves rownames.
  # but moving rownames to column cluster_label anyway bc of left_join below.
  rownames_to_column("cluster_label")
```

#### 5. `cl.center.df`
Join `cl.df` and `rd.cl.center` into `cl.center.df` for input into `get_KNN_graph`.
```{r}
cl.center.df <- left_join(rd.cl.center, cl.df,
                           by = c("cluster_label")) 
```

#### 6. `knn.cl` 
```{r}
# Fixes needed for proper output of knn.cl.df
# cl.center.df %<>% rename(cluster_size = "size")
# levels(cl) <- cl.df$cluster_id
cl.numeric <- cl
levels(cl.numeric) <- cl.df$cluster_id

knn.result <- RANN::nn2(data = rd.dat$pca[, 1:10], k = 15)

knn.cl <- get_knn_graph(rd.dat = rd.dat$pca[ , 1:10], 
                        cl.df =  cl.df, cl = cl.numeric, 
                        k = 15, 
                        knn.outlier.th = 2, outlier.frac.t = 0.5)

rm(rd.dat, ncx.clusters)
```

## 3. Make constellation plot
```{r make-constellation, eval = FALSE}
# Keep only cells where $frac [fraction of cells in cluster with nearest neighbors in different cluster] >= 0.05.
# Defined in `get_knn_graph`: 
# knn.cl.df$frac = knn.cl.df$Freq / knn.cl.df$cl.from.total
# 10% : 213 edges
knn.cl.df <- knn.cl$knn.cl.df  %>% filter(frac >= 0.1)

# Plot only edges between ExN lineage clusters.
# knn.cl.df %<>% filter_at(vars(cl.from.label, cl.to.label), 
#                        all_vars(str_detect(., "RG|IPC|Neuron|OPC|Dividing"))
#              )

cl.center.df$cluster_label %<>% str_remove("_combo2")
  
cl.plot <- plot_constellation(knn.cl.df, cl.center.df, out.dir = "../out",
                              node.label = "cluster_label", exxageration = 1, curved = TRUE, 
                              plot.parts = FALSE, plot.hull = NULL, 
                              plot.height = 40, plot.width = 40,
                              node.dodge = TRUE, label.size = 2, max_size = 20)

```

***
# 2. Find DE genes between connected clusters.
***
```{r}
# Dataframe w/ the proportion of k(15) nearest neighbors in each cluster for every cell.
nn.cl.df <- knn.cl$pred.result$pred.prob %>% as.data.frame

cl.df$cluster_id %<>% as.factor()
# Possibly move to top, before making all DFs.
cells.cl.df %<>% rename(cluster_label = "combined.cluster.2") %>% 
                  mutate(cluster_label = str_remove(cluster_label, "_combo2"))

cl.df %<>% mutate(cluster_label = str_remove(cluster_label, "_combo2"))


# Add column with cells' own cluster assignment from `cells.cl.df`.
nn.cl.df %<>% left_join(cells.cl.df,
                        by = c("query" = "cell.name")
                        )
# Add cluster_label corresponding to nn.cl.

nn.cl.df %<>% left_join(cl.df %>% select(cluster_label, cluster_id),
                        by = c("nn.cl" = "cluster_id"))

nn.cl.df %<>% select(query, cluster_id_self = "cluster_id", 
                    cluster_label_self = "cluster_label.x",
                    cluster_id_nn = "nn.cl",
                    cluster_label_nn = "cluster_label.y",
                    freq = "Freq")

# nn.cl.df %<>% select(query, combined.cluster.2, cluster_id, nn.cl, )

```

```{r}
x <- filter(knn.cl$knn.cl.df, frac >= 0.1 & cl.from != cl.to) %>% arrange(cl.from)
```

```{r}
knn.cl$knn.cl.cl.counts %>% head
```

```{r}
cl.df

x <- filter(nn.cl.df, cluster_label_self == "Neuron_8" & cluster_label_nn == "Neuron_3")
# 12,201 cells in Neuron_8

x %>% filter(freq > 0) %>% 
  ggplot() + geom_density(aes(freq))
```

Neuron_3 cells with nearest neighbors in Neuron_8
`cell.cl.counts` is a matrix of all cells and their counts of nearest neighbors in each cluster.

```{r}
cell.cl.counts <- knn.cl$knn.cell.cl.counts %>% as.data.frame.matrix %>% rownames_to_column("cell.name")

cell.cl.counts <- left_join(cells.cl.df, cell.cl.counts,  by = "cell.name") %>%
  select(-cell.type) %>% mutate(combined.cluster.2 = str_remove(combined.cluster.2, "_combo2")) %>%
  rename(cluster_label = "combined.cluster.2")


names(cell.cl.counts)[4:nrow(cell.cl.counts)] %<>% paste0("cl_", .)
```

# Find cells in `cluster_a` with bearest neighbors in `cluster_b`
```{r echo=TRUE}
cluster_a <- "IPC_6"
cluster_b <- "IPC_3"

cluster_b_col <- cell.cl.counts %>% filter(cluster_label == cluster_b) %>% 
                .$cluster_id %>% as.numeric %>% unique %>% paste0("cl_", .)


x <- cell.cl.counts %>% filter(cluster_label == cluster_a) %>% replace(is.na(.), 0)

nn.counts <- x[[cluster_b_col]]
n.cells <- length(x$cell.name)
sum(nn.counts > 0)
sum(nn.counts == 0)

(median.nnCounts <- median(nn.counts[nn.counts > 0]))
```

Distribution of neighbor counts in cluster_b for cells in a given cluster_a.
Use this to find the point at which cells will be split into comparison groups.
```{r}
  ggplot(x) + 
    ggtitle(paste(cluster_a, "cells \n n/k=15 nearest neighbors in", cluster_b)) +
    
    geom_bar(aes(x = get(cluster_b_col))) +
    geom_vline(xintercept = median.nnCounts, colour = "red") +
    annotate("text", x = median.nnCounts + 1, y = quantile(1: n.cells, .07),
             label = paste0("median = ", median.nnCounts)
  ) + 
    xlab(paste("# of", cluster_b, "neighbors")) +
    ylab("") +
    theme_minimal() 
```

Compare cells above and below the median count of `cluster_b` neighbors.
```{r echo=TRUE}
cells <- list()
cells$above.median <- x %>% filter(get(cluster_b_col) > median) %>% pull(cell.name)
cells$below.median <- x %>% filter(get(cluster_b_col) < median)  %>% pull(cell.name)

s.obj <- SetIdent(s.obj, cells = cells$above.median, value = 'nn_ct_above_med') %>% 
          SetIdent(cells = cells$below.median, value = 'nn_ct_below_med')

table(s.obj@active.ident) %>% as.data.frame.matrix()
markers <- list()
markers$nn_above_med <- FindMarkers(s.obj, slot = "scale.data", 
                                  # cells.1 = cells$above.median, cells.2 = cells$below.median,
                                  ident.1 = "nn_above_med", ident.2 = "nn_below_med", 
                                  logfc.threshold = 0)
```

Calculate enrichment ratio, filter genes w. adj p-value < 0.05, sort table.
Positive values indicate that the feature is more highly expressed in the first group.
```{r filter-markers, echo=TRUE}

markers.tmp <- markers$nn_above_med %>% rownames_to_column("gene") %>%
  mutate(enrich.ratio = pct.1 / pct.2,
         gene.score = avg_diff * enrich.ratio,
         across(.cols = where(is.numeric), .fns = round, digits = 5)
  ) %>%
  filter(p_val_adj <= 0.05) %>% 
  select(gene, pct.1, pct.2, enrich.ratio, avg_diff, gene.score, p_val_adj) %>%
  arrange(desc(gene.score))

write_tsv(markers.tmp, "../out/DEgenes_neuron3_vs_neuron8.tsv")
```

```{r render-table}
reactable(markers.tmp, defaultPageSize = 100,
          showSortable = TRUE, resizable = TRUE, highlight = TRUE, filterable = TRUE, minRows = 10,
          style = list(fontFamily = "Work Sans, sans-serif", fontSize = "12px")
  )
```

```{r}
# saveWidget(markers.tmp, file = )

# Same genes in both comparisons (sanity check)
xtab_set <- function(A,B){
              both    <-  union(A,B)
              inA     <-  both %in% A
              inB     <-  both %in% B
              return(table(inA,inB))
            }
xtab_set(markers$nn_above_med$geme, markers$nn_below_med$gene)
```

Make heatmap:
```{r heatmap}
tmp.sobj <- subsetSeurat(s.obj, cells.keep = flatten_chr(cells))

min(tmp.sobj@assays$RNA@scale.data)

markers.plot <- markers.tmp  %>% filter(enrich.ratio >= 1.3 | enrich.ratio <= 0.7) %>% arrange(desc(enrich.ratio))

exp.plot <- tmp.sobj@assays$RNA@scale.data[markers.plot$gene, ]

(exp.limits <- exp.plot %>% as.numeric %>%  quantile(c(0, 0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99, 1)))

heatmap <- 
  DoHeatmap(tmp.sobj, 
          # cells = flatten_chr(cells),
          features =  markers.plot$gene,
          disp.min = exp.limits["1%"],
          disp.max = exp.limits["99%"],
          angle = 0
          # slot = "data"
          # slim.col.label = TRUE,
          # remove.key = TRUE
) +
  theme(legend.text = element_text(size = 8),
        legend.position = "bottom",
    #legend.position = "none",
        text = element_text(size = 8),
        aspect.ratio = c(2,1)
  ) +
    scale_fill_viridis(end = 1, na.value = 'white', option = "magma", discrete = FALSE)

# ggsave(filename = "../DEgenes_neuron3_vs_neuron8_heatmap.pdf", width = 6, height = 4, units = "in" )

p <- ggplotly(heatmap, tooltip = "Feature", width = 1000, height = 800) %>% layout(legend = list(yanchor = 'bottom', orientation = 'h')) %>% partial_bundle() 

plotly_json(p)
p$x$data[c(2,3)] <- NULL

  saveWidget("~/carmensandoval.github.io/arklab/2nd-tri/clustering/DEgenes_neuron3_vs_neuron8_heatmap.html", selfcontained = TRUE)
```

Extra functions
```{r functions}
.env$source_rmd <- function(file, local = FALSE, ...){
  options(knitr.duplicate.label = 'allow')

  tempR <- tempfile(tmpdir = ".", fileext = ".R")
  on.exit(unlink(tempR))
  knitr::purl(file, output = tempR, quiet = TRUE)

  envir <- globalenv()
  source(tempR, local = envir, ...)
}

.env$reactable <- function(...) {
  htmltools::tagList(reactable::reactable(...))
}

# source("../../../code_general/setup_R_session_CSE.R")

attach(.env)
```